In this notebook, we will construct a speech dataset, implement an algorithm for wake word detection (sometimes also called keyword detection, or trigger word detection) and provide a base for real time demo.

In this project we will:
import random
import sys
import io
import os
import glob
import IPython
import wave
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
%matplotlib inline
from scipy.io import wavfile
from pydub import AudioSegment
Below we have defined some helper functions that will be used in loading and creating speech dataset.
# Calculate and plot spectrogram for a wav audio file
def graph_spectrogram(wav_file):
    """
    Compute and plot the spectrogram of a wav file.
    Arguments:
    wav_file -- path to a wav file (mono or stereo)
    Returns:
    pxx -- 2-D spectrogram array of shape (n_freq, time_steps)
    Raises:
    ValueError -- if the audio has more than 2 channels
    """
    rate, data = get_wav_info(wav_file)  # rate unused; specgram is given fs below
    nfft = 200      # Length of each window segment
    fs = 8000       # Sampling frequency passed to specgram
    noverlap = 120  # Overlap between windows
    nchannels = data.ndim
    if nchannels == 1:
        pxx, freqs, bins, im = plt.specgram(data, NFFT=nfft, Fs=fs, noverlap=noverlap)
    elif nchannels == 2:
        # Stereo: use only the first channel.
        pxx, freqs, bins, im = plt.specgram(data[:, 0], NFFT=nfft, Fs=fs, noverlap=noverlap)
    else:
        # FIX: previously fell through with pxx unbound, raising NameError at `return`.
        raise ValueError("Expected mono or stereo audio, got %d dimensions" % nchannels)
    return pxx
# Load a wav file
def get_wav_info(wav_file):
    """Read a wav file and return its sample rate and raw sample array."""
    sample_rate, samples = wavfile.read(wav_file)
    return sample_rate, samples
# Used to standardize volume of audio clip
def match_target_amplitude(sound, target_dBFS):
    """Return `sound` gained so that its loudness equals target_dBFS."""
    gain_needed = target_dBFS - sound.dBFS
    return sound.apply_gain(gain_needed)
# Load raw audio files for speech synthesis
def load_raw_audio(base_dir="./raw_data"):
    """
    Load the positive, negative and background wav clips used for synthesis.
    Arguments:
    base_dir -- root directory containing the positive/, negative/ and
                backgrounds/ subdirectories (default keeps the original path,
                so existing callers are unaffected)
    Returns:
    (positives, negatives, backgrounds) -- three lists of pydub AudioSegments
    """
    def _load_dir(subdir):
        # One-line purpose: load every .wav clip in base_dir/subdir, sorted.
        directory = os.path.join(base_dir, subdir)
        clips = []
        # sorted() makes clip order deterministic (os.listdir order is arbitrary,
        # so positives[0] previously depended on the filesystem).
        for filename in sorted(os.listdir(directory)):
            # FIX: match the ".wav" extension, not any name merely ending in "wav".
            if filename.endswith(".wav"):
                clips.append(AudioSegment.from_wav(os.path.join(directory, filename)))
        return clips
    positives = _load_dir("positive")
    backgrounds = _load_dir("backgrounds")
    negatives = _load_dir("negative")
    return positives, negatives, backgrounds
Let's start by building a dataset for our trigger word detection algorithm.
We can run the cells below to listen to some examples.
IPython.display.Audio("./raw_data/positive/1.wav")
IPython.display.Audio("./raw_data/positive/2.wav")
What really is an audio recording?
x = graph_spectrogram("./audio_examples/example_train.wav")
The graph above represents how active each frequency is (y axis) over a number of time-steps (x axis).

_, data = wavfile.read("audio_examples/example_train.wav")
print("Time steps in audio recording before spectrogram", data[:,0].shape)
print("Time steps in input after spectrogram", x.shape)
Now, we can define:
Tx = 5511 # The number of time steps input to the model from the spectrogram
n_freq = 101 # Number of frequencies input to the model at each time step of the spectrogram
Note that we may divide a 10 second interval of time with different units (steps).
We use pydub to synthesize audio, and it divides 10 seconds into 10,000 units.
Ty = 1375 # The number of time steps in the output of our model
Because speech data is hard to acquire and label, we will synthesize our training data using the audio clips of activates, negatives, and backgrounds.
# Load audio segments using pydub
positives, negatives, backgrounds = load_raw_audio()
print("background len should be 10,000, since it is a 10 sec clip\n" + str(len(backgrounds[0])),"\n")
print("activate[0] len may be around 1000, since an `activate` audio clip is usually around 1 second (but varies a lot) \n" + str(len(positives[0])),"\n")
print("activate[1] len: different `activate` clips can have different lengths\n" + str(len(positives[1])),"\n")
For example, label index int(1375*0.5) = 687 corresponds to the moment 5 seconds into the audio clip.
# get_random_time_segment(segment_ms) returns a random time segment onto which
# we can insert an audio clip of duration segment_ms.
def get_random_time_segment(segment_ms):
    """
    Gets a random time segment of duration segment_ms in a 10,000 ms audio clip.
    Arguments:
    segment_ms -- the duration of the audio clip in ms ("ms" stands for "milliseconds")
    Returns:
    segment_time -- a tuple of (segment_start, segment_end) in ms
    """
    # Draw the start so the segment cannot run past the 10 s background
    # (np.random.randint's `high` bound is exclusive).
    onset = np.random.randint(low=0, high=10000 - segment_ms)
    return (onset, onset + segment_ms - 1)
Implementing is_overlapping(segment_time, existing_segments)
def is_overlapping(segment_time, previous_segments):
    """
    Checks if the time of a segment overlaps with the times of existing segments.
    Arguments:
    segment_time -- a tuple of (segment_start, segment_end) for the new segment
    previous_segments -- a list of tuples of (segment_start, segment_end) for the existing segments
    Returns:
    True if the time segment overlaps with any of the existing segments, False otherwise
    """
    segment_start, segment_end = segment_time
    # Two closed intervals overlap iff each one starts before the other ends.
    return any(
        segment_start <= prev_end and segment_end >= prev_start
        for prev_start, prev_end in previous_segments
    )
# UNIT TEST
def is_overlapping_test(target):
    """Table-driven unit tests for an is_overlapping implementation."""
    cases = [
        ((670, 1430), [], False, "Overlap with an empty list must be False"),
        ((500, 1000), [(100, 499), (1001, 1100)], False, "Almost overlap, but still False"),
        ((750, 1250), [(100, 750), (1001, 1100)], True, "Must overlap with the end of first segment"),
        ((750, 1250), [(300, 600), (1250, 1500)], True, "Must overlap with the begining of second segment"),
        ((750, 1250), [(300, 600), (600, 1500), (1600, 1800)], True, "Is contained in second segment"),
    ]
    for segment, previous, expected, message in cases:
        assert target(segment, previous) == expected, message
    print("\033[92m All tests passed!")
is_overlapping_test(is_overlapping)
overlap1 = is_overlapping((950, 1430), [(2000, 2550), (260, 949)])
overlap2 = is_overlapping((2305, 2950), [(824, 1532), (1900, 2305), (3424, 3656)])
print("Overlap 1 = ", overlap1)
print("Overlap 2 = ", overlap2)
Implementing insert_audio_clip():
def insert_audio_clip(background, audio_clip, previous_segments, retries=5):
    """
    Insert a new audio segment over the background noise at a random time step, ensuring that the
    audio segment does not overlap with existing segments.
    Arguments:
    background -- a 10 second background audio recording.
    audio_clip -- the audio clip to be inserted/overlaid.
    previous_segments -- times where audio segments have already been placed;
                         MUTATED in place (the chosen segment is appended).
    retries -- how many fresh random positions to try when a draw overlaps
               (new parameter with a default, so existing callers are unchanged).
    Returns:
    new_background -- the updated background audio (unchanged if no position fit)
    segment_time -- (start_ms, end_ms) of the inserted clip, or the sentinel
                    (10000, 10000) when no non-overlapping position was found
    """
    # Get the duration of the audio clip in ms (len() of a pydub segment is ms)
    segment_ms = len(audio_clip)
    # Pick a random candidate time segment for the clip.
    segment_time = get_random_time_segment(segment_ms)
    # FIX: the original `while is_overlapping(...)` loop was unbounded and could
    # spin forever once previous_segments left no room for the clip. Bound the
    # number of redraws instead.
    while is_overlapping(segment_time, previous_segments) and retries > 0:
        segment_time = get_random_time_segment(segment_ms)
        retries -= 1
    if not is_overlapping(segment_time, previous_segments):
        # Record the segment so later insertions avoid it, then superpose.
        previous_segments.append(segment_time)
        new_background = background.overlay(audio_clip, position=segment_time[0])
    else:
        # Could not place the clip: return the background unchanged with a
        # sentinel segment that lies entirely outside the 10 s clip, so any
        # labels derived from it fall past Ty and are ignored.
        new_background = background
        segment_time = (10000, 10000)
    return new_background, segment_time
np.random.seed(5)
audio_clip, segment_time = insert_audio_clip(backgrounds[0], positives[0], [(3790, 4400)])
audio_clip.export("insert_test.wav", format="wav")
print("Segment Time: ", segment_time)
IPython.display.Audio("insert_test.wav")
y is a (1, 1375) dimensional vector, since $T_y = 1375$; the valid indices are y[0][0] through y[0][1374]. So, for example, if an "activate" clip ends at output step 1370, we would set y[0][1371] = y[0][1372] = y[0][1373] = y[0][1374] = 1. Implementing insert_ones():
To convert segment_end_ms (using a 10,000 step discretization) to the 1,375-step output discretization, use: segment_end_y = int(segment_end_ms * Ty / 10000.0)
def insert_ones(y, segment_end_ms):
    """
    Update the label vector y. The labels of the 50 output steps strictly after the end of the segment
    should be set to 1. By strictly we mean that the label of segment_end_y should be 0 while, the
    50 following labels should be ones.
    Arguments:
    y -- numpy array of shape (1, Ty), the labels of the training example
    segment_end_ms -- the end time of the segment in ms
    Returns:
    y -- updated labels (modified in place and returned)
    """
    # FIX: read the number of output steps from y itself rather than relying on
    # the module-level global Ty, so the function works for any label length.
    ty = y.shape[1]
    # Convert from the 10,000 ms clip discretization to output-step indices.
    segment_end_y = int(segment_end_ms * ty / 10000.0)
    # Set the 50 steps strictly after the segment end, clipped at the vector end
    # (vectorized slice replaces the original per-index loop).
    y[0, segment_end_y + 1 : min(segment_end_y + 51, ty)] = 1.0
    return y
arr1 = insert_ones(np.zeros((1, Ty)), 9700)
plt.plot(insert_ones(arr1, 4251)[0,:])
print("sanity checks:", arr1[0][1333], arr1[0][634], arr1[0][635])
Finally, we can use insert_audio_clip and insert_ones to create a new training example.
Implementing create_training_example(). We will need to carry out the following steps:
def create_training_example(background, activates, negatives):
    """
    Creates a training example with a given background, activates, and negatives.
    Arguments:
    background -- a 10 second background audio recording
    activates -- a list of audio segments of the word "activate"
    negatives -- a list of audio segments of random words that are not "activate"
    Returns:
    x -- the spectrogram of the training example
    y -- the label at each time step of the spectrogram
    """
    # FIX: the original called np.random.seed(18) here. Re-seeding on every call
    # meant the dataset loop below produced the SAME clips at the SAME positions
    # for every example. Seeding is now the caller's responsibility (the dataset
    # loop already seeds once before iterating), so examples vary.
    # Make background quieter so the inserted words stand out.
    background = background - 20
    # Label vector: one sigmoid output per spectrogram time step, initialized to 0.
    y = np.zeros((1, Ty))
    # Times already occupied by inserted clips (appended to by insert_audio_clip).
    previous_segments = []
    # Select 0-4 random "activate" audio clips from the entire list of "activates" recordings
    number_of_activates = np.random.randint(0, 5)
    random_indices = np.random.randint(len(activates), size=number_of_activates)
    random_activates = [activates[i] for i in random_indices]
    # Loop over randomly selected "activate" clips and insert in background
    for random_activate in random_activates:
        # Insert the audio clip on the background
        background, segment_time = insert_audio_clip(background, random_activate, previous_segments)
        # Retrieve segment_start and segment_end from segment_time
        segment_start, segment_end = segment_time
        # Label the 50 output steps following each "activate" as positive.
        y = insert_ones(y, segment_end)
    # Select 0-2 random negatives audio recordings from the entire list of "negatives" recordings
    number_of_negatives = np.random.randint(0, 3)
    random_indices = np.random.randint(len(negatives), size=number_of_negatives)
    random_negatives = [negatives[i] for i in random_indices]
    # Loop over randomly selected negative clips and insert in background
    # (negatives get no labels -- the model must learn to ignore them).
    for random_negative in random_negatives:
        background, _ = insert_audio_clip(background, random_negative, previous_segments)
    # Standardize the volume of the audio clip
    background = match_target_amplitude(background, -20.0)
    # Export new training example
    file_handle = background.export("train" + ".wav", format="wav")
    print("File (train.wav) was saved in your directory.")
    # Get and plot spectrogram of the new recording (background with superposition of positive and negatives)
    x = graph_spectrogram("train.wav")
    return x, y
x, y = create_training_example(backgrounds[0], positives, negatives)
Now we can listen to the training example we just created and compare it to the spectrogram generated above.
IPython.display.Audio("train.wav")
plt.plot(y[0])
# Build a small synthetic training set, alternating between the background
# clips. Seeded once here so the whole dataset is reproducible.
np.random.seed(4543)
nsamples = 32
X = []
Y = []
for i in range(0, nsamples):
    if i%10 == 0:
        print(i)
    # NOTE(review): backgrounds[i % 2] assumes at least 2 background clips exist — confirm.
    x, y = create_training_example(backgrounds[i % 2], positives, negatives)
    # The model expects (Tx, n_freq) per example; the spectrogram is (n_freq, Tx),
    # and y is (1, Ty) -> (Ty, 1).
    X.append(x.swapaxes(0,1))
    Y.append(y.swapaxes(0,1))
X = np.array(X)
Y = np.array(Y)
X.shape, Y.shape
# Save the data for further uses
# np.save(f'./XY_train/X.npy', X)
# np.save(f'./XY_train/Y.npy', Y)
# Load the preprocessed training examples
# X = np.load("./XY_train/X.npy")
# Y = np.load("./XY_train/Y.npy")
# Load preprocessed dev set examples
X_dev = np.load("./XY_dev/X_dev.npy")
Y_dev = np.load("./XY_dev/Y_dev.npy")
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from tensorflow.keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from tensorflow.keras.optimizers import Adam
Our goal is to build a network that will ingest a spectrogram and output a signal when it detects the trigger word. This network will use 4 layers:
* A convolutional layer
* Two GRU layers
* A dense layer.
Here is the architecture we will use.

One key layer of this model is the 1D convolutional step (near the bottom of Figure 3).
# GRADED FUNCTION: model
def model(input_shape):
    """
    Function creating the model's graph in Keras.
    Argument:
    input_shape -- shape of the model's input data (using Keras conventions),
                   here (Tx, n_freq) = (5511, 101)
    Returns:
    model -- Keras model instance
    """
    X_input = Input(shape = input_shape)
    # CONV layer: kernel 15 with stride 4 reduces the 5511 input steps to 1375 output steps
    X = Conv1D(filters=196,kernel_size=15,strides=4)(X_input) # CONV1D
    X = BatchNormalization()(X) # Batch normalization
    X = Activation("relu")(X) # ReLu activation
    X = Dropout(rate=0.8)(X) # dropout (use 0.8)
    # First GRU Layer
    X = GRU(units=128, return_sequences = True)(X) # GRU (use 128 units and return the sequences) , reset_after=True
    X = Dropout(rate=0.8)(X) # dropout (use 0.8)
    X = BatchNormalization()(X) # Batch normalization
    # Second GRU Layer
    X = GRU(units=128, return_sequences = True)(X) # GRU (use 128 units and return the sequences) , reset_after=True
    X = Dropout(rate=0.8)(X) # dropout (use 0.8)
    X = BatchNormalization()(X) # Batch normalization
    # NOTE(review): this is a second dropout after the second GRU (in addition to
    # the one two lines above) — presumably intentional per the assignment; confirm.
    X = Dropout(rate=0.8)(X) # dropout (use 0.8)
    # Time-distributed dense layer: per-step sigmoid gives P(trigger word just ended)
    X = TimeDistributed(Dense(1, activation = "sigmoid"))(X) # time distributed (sigmoid)
    model = Model(inputs = X_input, outputs = X)
    return model
model = model(input_shape = (Tx, n_freq))
model.summary()
The output of the network is of shape (None, 1375, 1) while the input is (None, 5511, 101). The Conv1D has reduced the number of steps from 5511 to 1375.
# FIX: `lr` is the deprecated alias of `learning_rate` in tf.keras 2.x and is
# removed in recent releases; the remaining hyperparameters are unchanged.
opt = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])
model.fit(X, Y, batch_size = 5, epochs=100)
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()
model = tf.compat.v1.keras.models.load_model('./models/tr_model.h5')
# model = load_model('./models/tr_model.h5')
Finally, let's see how your model performs on the dev set.
loss, acc = model.evaluate(X_dev, Y_dev)
print("Dev set accuracy = ", acc)
This looks pretty good!
Now that you have built a working model for trigger word detection, let's use it to make predictions. This code snippet runs audio (saved in a wav file) through the network.
def detect_triggerword(filename):
    """Run a 10 s wav file through the trained model; plot the spectrogram and
    the per-step trigger probabilities, and return the raw predictions."""
    plt.subplot(2, 1, 1)
    spectrum = graph_spectrogram(filename)
    # The spectrogram is (freqs, Tx) but the model wants (Tx, freqs),
    # plus a leading batch dimension.
    model_input = np.expand_dims(spectrum.swapaxes(0, 1), axis=0)
    predictions = model.predict(model_input)
    plt.subplot(2, 1, 2)
    plt.plot(predictions[0,:,0])
    plt.ylabel('probability')
    plt.show()
    return predictions
Implementing chime_on_activate()
We use this code to convert from the 1,375 step discretization to the 10,000 step discretization and insert a "chime" using pydub:
audio_clip = audio_clip.overlay(chime, position = ((i / Ty) * audio.duration_seconds)*1000)
# Path to the chime sound that gets overlaid at each detection.
chime_file = "audio_examples/chime.wav"
def chime_on_activate(filename, predictions, threshold):
    """
    Overlay a chime at each detected trigger word in `filename` and write the
    result to chime_output.wav.
    Arguments:
    filename -- path to the 10 s wav clip the predictions were computed from
    predictions -- model output of shape (1, Ty, 1), per-step probabilities
    threshold -- probability above which a step counts as a detection
    """
    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    # Number of output steps, taken from the predictions (shadows the global Ty).
    Ty = predictions.shape[1]
    # Initialize the number of consecutive output steps to 0
    consecutive_timesteps = 0
    # Loop over the output steps in the y
    for i in range(Ty):
        # Increment consecutive output steps
        consecutive_timesteps += 1
        # Chime only when the prediction exceeds the threshold AND more than 75
        # steps (~0.55 s) have passed since the LAST chime, so one sustained
        # activation yields a single chime (a debounce, not a run-length test).
        # NOTE(review): this also suppresses any detection in the first 75 steps
        # of the clip — confirm that is intended.
        if predictions[0,i,0] > threshold and consecutive_timesteps > 75:
            # Superpose audio and background using pydub
            audio_clip = audio_clip.overlay(chime, position = ((i / Ty) * audio_clip.duration_seconds)*1000)
            # Reset consecutive output steps to 0
            consecutive_timesteps = 0
    audio_clip.export("chime_output.wav", format='wav')
Let's explore how our model performs on two unseen audio clips from the development set. Let's first listen to the two dev set clips.
IPython.display.Audio("./raw_data/dev/Recording.wav")
Now lets run the model on these audio clips and see if it adds a chime after "activate"!
filename = "./raw_data/dev/Recording.wav"
prediction = detect_triggerword(filename)
chime_on_activate(filename, prediction, 0.5)
IPython.display.Audio("./chime_output.wav")
Here we can try the model on our own audio clips outside of the dev set!
# Preprocess the audio to the correct format
def preprocess_audio(filename):
    """Trim or zero-pad `filename` to exactly 10,000 ms at 44,100 Hz,
    overwriting the file in place."""
    # Keep at most the first 10 s of the recording.
    clip = AudioSegment.from_wav(filename)[:10000]
    # Lay the clip over 10 s of silence so short recordings are padded out.
    clip = AudioSegment.silent(duration=10000).overlay(clip)
    # Set frame rate to 44100
    clip = clip.set_frame_rate(44100)
    # Export as wav
    clip.export(filename, format='wav')
your_filename = "raw_data/dev/Recording.wav"
preprocess_audio(your_filename)
IPython.display.Audio(your_filename) # listen to the audio we uploaded
chime_threshold = 0.5
prediction = detect_triggerword(your_filename)
chime_on_activate(your_filename, prediction, chime_threshold)
IPython.display.Audio("./chime_output.wav")
So far our model can only take a static 10 seconds audio clip and make the prediction of the trigger word location.
Here is the fun part, let's replace with the live audio stream instead!
The model we have built expects 10-second audio clips as input. Training another model that takes shorter audio clips is possible, but it would require retraining the model on a GPU for several hours.
We also don't want to wait 10 seconds for the model to tell us that the trigger word was detected. One solution is a moving 10-second audio stream window with a step size of 0.5 seconds: we ask the model to predict every 0.5 seconds, which reduces the delay and makes the system responsive.
We also add the silence detection mechanism to skip prediction if the loudness is below a threshold, this can save some computing power.
The input 10 seconds audio is updated every 0.5 second. Meaning for every 0.5 second, the oldest 0.5 second chunk of audio will be discarded and the fresh 0.5 second audio will be shifted in. The job of the model is to tell if there is a new trigger word detected in the fresh 0.5 second audio chunk.
And here is the code to make it happen.
def has_new_triggerword(predictions, chunk_duration, feed_duration, threshold=0.5):
    """
    Function to detect new trigger word in the latest chunk of input audio.
    It is looking for the rising edge of the predictions data belongs to the
    last/latest chunk.
    Argument:
    predictions -- predicted labels from model (1-D array of probabilities)
    chunk_duration -- time in second of a chunk
    feed_duration -- time in second of the input to model
    threshold -- probability above which a step is considered positive
    Returns:
    True if new trigger word detected in the latest chunk
    """
    predictions = predictions > threshold
    # How many prediction steps correspond to the newest chunk.
    chunk_predictions_samples = int(len(predictions) * chunk_duration / feed_duration)
    # FIX: when this rounds down to 0, predictions[-0:] is the WHOLE array, so
    # the scan would cover old audio instead of the latest chunk. Treat a
    # zero-length chunk as "nothing new".
    if chunk_predictions_samples <= 0:
        return False
    chunk_predictions = predictions[-chunk_predictions_samples:]
    level = chunk_predictions[0]
    # A False -> True transition inside the chunk is a rising edge = new detection.
    for pred in chunk_predictions:
        if pred > level:
            return True
        else:
            level = pred
    return False
To get the audio stream, we use the pyaudio library. Which has an option to read the audio stream asynchronously. That means the audio stream recording happens in another thread and when a new fixed length of audio data is available, it notifies our model to process it in the main thread.
You may ask why not just read a fixed length of audio and just process it in one function?
Since for the model to generate the prediction, it takes quite some time, sometimes measured in tens of milliseconds. By doing so, we are risking creating gaps in the audio stream while we are doing the computation.
Here is the code for the pyaudio library's callback, in the callback function we send a queue to notify the model to process the data in the main thread.
def detect_triggerword_spectrum(x):
    """
    Function to predict the location of the trigger word.
    Argument:
    x -- spectrum of shape (freqs, Tx)
    i.e. (Number of frequencies, The number time steps)
    Returns:
    predictions -- flattened numpy array to shape (number of output time steps)
    """
    # (freqs, Tx) -> (1, Tx, freqs): transpose for the model and add a batch axis.
    batch = np.expand_dims(x.swapaxes(0, 1), axis=0)
    # Flatten (1, Ty, 1) down to a 1-D vector of per-step probabilities.
    return model.predict(batch).reshape(-1)
# NOTE(review): this is a duplicate definition of has_new_triggerword (also
# defined earlier in the notebook); this later one is the version that runs.
def has_new_triggerword(predictions, chunk_duration, feed_duration, threshold=0.5):
    """
    Function to detect new trigger word in the latest chunk of input audio.
    It is looking for the rising edge of the predictions data belongs to the
    last/latest chunk.
    Argument:
    predictions -- predicted labels from model (1-D array of probabilities)
    chunk_duration -- time in second of a chunk
    feed_duration -- time in second of the input to model
    threshold -- probability above which a step is considered positive
    Returns:
    True if new trigger word detected in the latest chunk
    """
    predictions = predictions > threshold
    # How many prediction steps correspond to the newest chunk.
    chunk_predictions_samples = int(len(predictions) * chunk_duration / feed_duration)
    # FIX: when this rounds down to 0, predictions[-0:] is the WHOLE array, so
    # the scan would cover old audio instead of the latest chunk. Treat a
    # zero-length chunk as "nothing new".
    if chunk_predictions_samples <= 0:
        return False
    chunk_predictions = predictions[-chunk_predictions_samples:]
    level = chunk_predictions[0]
    # A False -> True transition inside the chunk is a rising edge = new detection.
    for pred in chunk_predictions:
        if pred > level:
            return True
        else:
            level = pred
    return False
chunk_duration = 0.5 # Each read length in seconds from mic.
fs = 44100 # sampling rate for mic
chunk_samples = int(fs * chunk_duration) # Each read length in number of samples.
# Each model input data duration in seconds, need to be an integer numbers of chunk_duration
feed_duration = 10
feed_samples = int(fs * feed_duration)  # 441,000 samples = one 10 s model input window
# NOTE(review): the mic records at 44100 Hz while get_spectrogram passes Fs=8000
# to specgram; preprocess_audio also resamples files to 44100 Hz, so this appears
# deliberate, but confirm the training data used the same convention.
# The feed window must hold a whole number of chunks for the sliding update to work.
assert feed_duration/chunk_duration == int(feed_duration/chunk_duration)
def get_spectrogram(data):
    """
    Function to compute a spectrogram.
    Argument:
    data -- one channel / dual channel audio data as numpy array
    Returns:
    pxx -- spectrogram, 2-D array, columns are the periodograms of successive segments.
    Raises:
    ValueError -- if the audio has more than 2 channels
    """
    nfft = 200      # Length of each window segment
    fs = 8000       # Sampling frequency passed to specgram
    noverlap = 120  # Overlap between windows
    nchannels = data.ndim
    if nchannels == 1:
        pxx, _, _ = mlab.specgram(data, nfft, fs, noverlap = noverlap)
    elif nchannels == 2:
        # Stereo: use only the first channel.
        pxx, _, _ = mlab.specgram(data[:,0], nfft, fs, noverlap = noverlap)
    else:
        # FIX: previously fell through with pxx unbound -> NameError at `return`.
        raise ValueError("Expected mono or stereo audio, got %d dimensions" % nchannels)
    return pxx
def plt_spectrogram(data):
    """
    Function to compute and plot a spectrogram.
    Argument:
    data -- one channel / dual channel audio data as numpy array
    Returns:
    pxx -- spectrogram, 2-D array, columns are the periodograms of successive segments.
    Raises:
    ValueError -- if the audio has more than 2 channels
    """
    nfft = 200      # Length of each window segment
    fs = 8000       # Sampling frequency passed to specgram
    noverlap = 120  # Overlap between windows
    nchannels = data.ndim
    if nchannels == 1:
        pxx, _, _, _ = plt.specgram(data, nfft, fs, noverlap = noverlap)
    elif nchannels == 2:
        # Stereo: use only the first channel.
        pxx, _, _, _ = plt.specgram(data[:,0], nfft, fs, noverlap = noverlap)
    else:
        # FIX: previously fell through with pxx unbound -> NameError at `return`.
        raise ValueError("Expected mono or stereo audio, got %d dimensions" % nchannels)
    return pxx
def get_audio_input_stream(callback):
    """Open a non-blocking pyaudio microphone stream; `callback` is invoked on a
    separate audio thread each time chunk_samples of data are available."""
    stream = pyaudio.PyAudio().open(
        format=pyaudio.paInt16,   # 16-bit signed samples, matching the int16 buffers below
        channels=1,               # mono
        rate=fs,                  # module-level mic sampling rate (44100 Hz)
        input=True,
        frames_per_buffer=chunk_samples,  # one 0.5 s chunk per callback invocation
        input_device_index=0,     # NOTE(review): device 0 is not always the default mic — confirm
        stream_callback=callback)
    return stream
import pyaudio
from queue import Queue
from threading import Thread
import sys
import time
# Queue to communicate between the audio callback and main thread
q = Queue()
# Flag the callback clears (on timeout) to stop the main loop.
run = True
# Mean absolute amplitude below which a chunk is treated as silence (prediction skipped).
silence_threshold = 100
# Run the demo for a timeout seconds
timeout = time.time() + 0.5*60 # 0.5 minutes from now
# Data buffer for the input waveform: rolling 10 s window of int16 mic samples.
data = np.zeros(feed_samples, dtype='int16')
def callback(in_data, frame_count, time_info, status):
    """pyaudio stream callback, run on the audio thread once per 0.5 s chunk.
    Appends the fresh samples to the rolling 10 s buffer and, unless the chunk
    is silent, posts the buffer to `q` for the main thread to run the model on.
    Writes one progress character per chunk: '-' silence, '.' sound."""
    global run, timeout, data, silence_threshold
    # Stop the demo once the wall-clock timeout has passed.
    if time.time() > timeout:
        run = False
    data0 = np.frombuffer(in_data, dtype='int16')
    # Skip prediction on silent chunks to save compute.
    if np.abs(data0).mean() < silence_threshold:
        sys.stdout.write('-')
        return (in_data, pyaudio.paContinue)
    else:
        sys.stdout.write('.')
    # Shift the new chunk into the rolling window, discarding the oldest samples.
    data = np.append(data,data0)
    if len(data) > feed_samples:
        data = data[-feed_samples:]
        # Process data async by sending a queue.
        q.put(data)
    return (in_data, pyaudio.paContinue)
# Start the mic stream and run predictions on the main thread until `run` is
# cleared by the callback's timeout or the user interrupts the cell.
stream = get_audio_input_stream(callback)
stream.start_stream()
try:
    while run:
        # Blocks until the callback posts a fresh 10 s buffer.
        data = q.get()
        spectrum = get_spectrogram(data)
        preds = detect_triggerword_spectrum(spectrum)
        new_trigger = has_new_triggerword(preds, chunk_duration, feed_duration)
        if new_trigger:
            # '1' marks a new trigger word detected in the latest chunk.
            sys.stdout.write('1')
except (KeyboardInterrupt, SystemExit):
    # Manual interrupt: shut the stream down and stop the loop.
    stream.stop_stream()
    stream.close()
    timeout = time.time()
    run = False
stream.stop_stream()
stream.close()
When we run it, it outputs one of the 3 characters every 0.5 second.
"-" means silence,
"." means not silence and no trigger word,
"1" means a new trigger word is detected.
# Safety cleanup cell: stop and close the mic stream if the demo above was interrupted.
stream.stop_stream()
stream.close()